Answering several data analysis questions using our RSQLite database setup in the previous post
In this post, we demonstrate how to answer a number of questions related to room listing type and host info using our newly created RSQLite database.
We answer the following 11 questions:
First, we must load the necessary libraries required for our data analysis and connect to our database.
Next, we connect to our database that we set up in the previous post
The build_airbnb_database() function: if you have not yet set up the required database, you may use the build_airbnb_database() function to do so for you. Note that this function leverages the remove_old_database() and insert_to_sql() functions defined in the previous post.
# Build the Airbnb SQLite database from a raw scraped listings data frame.
#
# Args:
#   con:                 an open DBI connection to the target SQLite database
#   listing_data:        raw listings data frame (one row per listing)
#   remove_old_database: if TRUE, wipe the existing database content first
#
# Side effects: creates the `host_info` and `listing` tables in `con` and
# inserts the cleaned data via insert_to_sql() (defined in the previous post).
# Returns nothing useful; called for its side effects.
build_airbnb_database <- function(con, listing_data, remove_old_database = FALSE){
#################### Remove Existing database
if(remove_old_database == TRUE){
# NOTE(review): the surrounding text names this helper remove_old_database();
# confirm which of the two names is correct — one is a typo.
remove_live_database(con)
}
#################### Deal with NA values
listing_data <-
listing_data %>%
# Convert dates to characters for NA values
# (ifelse() below would strip the Date class, so round-trip via character)
mutate(last_scraped = as.character(last_scraped),
host_since = as.character(host_since),
calendar_last_scraped = as.character(calendar_last_scraped),
first_review = as.character(first_review),
last_review = as.character(last_review),
) %>%
# Homogenize NA values
#*# Taken from: https://rpubs.com/Argaadya/create_table_sql
mutate_all(function(x) ifelse(x == "" | x == "None" | x == "N/A", NA, x)) %>% #*#
# mutate_all(function(x) ifelse(is.na(x), "NULL", x)) %>%
# Convert character strings back to date type
mutate(last_scraped = as.Date(last_scraped),
host_since = as.Date(host_since),
calendar_last_scraped = as.Date(calendar_last_scraped),
first_review = as.Date(first_review),
last_review = as.Date(last_review))
#################### Extract host data
# Host columns are split into their own table (normalization); host_id is
# the key that links them back to listings.
host_data <- listing_data %>%
select(host_id:host_identity_verified,
calculated_host_listings_count:calculated_host_listings_count_shared_rooms)
#################### Remove duplicate values
# A host with several listings appears once per listing; keep one row each.
host_data <- host_data %>% distinct()
#################### Convert dates
# Note that this will need to converted back to type = date for analysis
host_data <- host_data %>% mutate(host_since = as.character(host_since))
#################### Clean host verification column
# Strip the Python-list punctuation ( ' [ ] ) from the verification string.
host_data <-
host_data %>%
mutate(host_verifications = str_remove_all(host_verifications, "[\\'\\[\\]]"))
#################### Create table for host info
query <- "CREATE TABLE host_info(
host_id INT,
host_url VARCHAR(50),
host_name VARCHAR(100),
host_since VARCHAR(50),
host_location VARCHAR(500),
host_about VARCHAR(10000),
host_response_time VARCHAR(50),
host_response_rate VARCHAR(50),
host_acceptance_rate VARCHAR(50),
host_is_superhost BOOLEAN,
host_thumbnail_url VARCHAR(500),
host_picture_url VARCHAR(500),
host_neighbourhood VARCHAR(50),
host_listings_count INT,
host_total_listings_count INT,
host_verifications VARCHAR(500),
host_has_profile_pic BOOLEAN,
host_identity_verified BOOLEAN,
calculated_host_listings_count INT,
calculated_host_listings_count_entire_homes INT,
calculated_host_listings_count_private_rooms INT,
calculated_host_listings_count_shared_rooms INT,
PRIMARY KEY(host_id)
)"
#################### Load host_info table
# NOTE(review): dbExecute() is preferred for DDL; dbSendQuery() here leaves
# an open result set that is never cleared.
dbSendQuery(con, query)
#################### Check schema
res <- dbSendQuery(con, "PRAGMA table_info([host_info]);")
fetch(res)
dbClearResult(res)
#################### Insert data into host_info table
insert_to_sql(con, "host_info", host_data)
####################Listing table Processing####################
# listing_data %>% view()
listing_data %>% glimpse()
#################### Remove host_data columns
# Drop the host columns (except host_id, kept as the foreign key).
listing_data <- listing_data %>%
select( - names(host_data)[-1])
#################### Remove extraneous columns
listing_data <- listing_data %>%
select(-c(license, calendar_updated, bathrooms, scrape_id))
#################### Remove dollar signs from price column
# "$1,234.00" -> 1234 so price can be stored and queried numerically.
listing_data <- listing_data %>%
mutate(price = str_remove_all(price, "[$,]") %>%
as.numeric()
)
#################### Transform amenities and host verification column
listing_data <- listing_data %>%
mutate(amenities = str_remove_all(amenities, "[\"\\'\\[\\]]"))
listing_data %>% glimpse()
#################### Convert dates to character
# SQLite has no native date type, so dates go in as text.
listing_data <-
listing_data %>%
mutate(last_scraped = as.character(last_scraped),
calendar_last_scraped = as.character(calendar_last_scraped),
first_review = as.character(first_review),
last_review = as.character(last_review))
#################### Create listing table
query_2 <- [1856 chars quoted with '"']
#################### Insert listing table into database
# NOTE(review): same as above — dbExecute() would avoid the dangling result.
dbSendQuery(con, query_2)
#################### Insert data into listing table
insert_to_sql(con, "listing", listing_data)
#################### Extract tables from database
}
We can load the data from our database in either of the following ways:
# Option 1: lazy dplyr/dbplyr table handles collected into data frames
host_info <- tbl(con, "host_info") %>% as.data.frame()
listing <- tbl(con, "listing") %>% as.data.frame()
# Option 2: the DBI send/fetch/clear pattern
# load host_info table
res_host_info <- dbSendQuery(con, "select * from host_info")
host_info <- dbFetch(res_host_info)  # dbFetch() supersedes the deprecated fetch()
dbClearResult(res_host_info)
# load listing table
res_listing <- dbSendQuery(con, "select * from listing")
listing <- dbFetch(res_listing)
dbClearResult(res_listing)
We notice that while an accommodation may only have one score in terms of price, it is reviewed among several different dimensions:
We start by selecting the price and review related columns from the listing table, drop any rows that are missing review score, and filter out a spurious outlier. This leaves us with roughly 10,000 observations remaining, more than enough to theoretically examine correlation.
We define the following function, c_plot(), to handle the repetitive plotting of price versus our seven different review dimensions.
# Jittered scatter plot of price versus a single review dimension.
#
# Args:
#   df:     data frame containing a `price` column
#   y_val:  vector of review scores to plot on the y axis
#   y_name: display name used for both the y-axis label and the title
#   clr:    point colour (defaults to "dodgerblue4")
#
# Returns: a ggplot object with price on a log10 x scale.
c_plot <- function(df, y_val, y_name, clr = "dodgerblue4"){
  p <- df %>%
    ggplot(aes(x = price,
               y = y_val)) +
    geom_jitter(color = clr, alpha = 0.5) +
    # `labels` spelled out in full; the original `label =` relied on
    # partial argument matching.
    scale_x_log10(labels = scales::number_format(big.mark = ",")) +
    labs(x = "Price",
         y = y_name,
         title = y_name) + theme_tq()
  p
}
We then construct our correlation graphs using the above defined c_plot() function. We also use the grid.arrange() function from the gridExtra library to help align our multiple plots for parallel examination.
# Colours for correlation plots.
# Renamed from `c`: naming a vector `c` shadows base::c().
plot_colors <- c("Aquamarine4", "Sienna3")
# Build correlation plots (one per review dimension)
q5_1 <- c_plot(q5, q5$review_scores_rating, "Rating vs Price")
q5_2 <- c_plot(q5, q5$review_scores_accuracy, "Accuracy", clr = plot_colors[1])
q5_3 <- c_plot(q5, q5$review_scores_cleanliness, "Cleanliness", clr = plot_colors[1])
q5_4 <- c_plot(q5, q5$review_scores_checkin, "Check-in", clr = plot_colors[1])
q5_5 <- c_plot(q5, q5$review_scores_communication, "Communication", clr = plot_colors[2])
q5_6 <- c_plot(q5, q5$review_scores_location, "Location", clr = plot_colors[2])
q5_7 <- c_plot(q5, q5$review_scores_value, "Value", clr = plot_colors[2])
# Output correlation plots (overall rating alone, then two rows of three)
q5_1
grid.arrange(q5_2, q5_3, q5_4, ncol = 3)
grid.arrange(q5_5, q5_6, q5_7, ncol = 3)
Our plots appear noisy, and while we cannot see a consistent pattern between price and the seven dimensions along which accommodations are reviewed, we can statistically test the correlation by applying the cor.test() function to each combination of price and review score.
# Conduct numerical correlation test
# Review-score columns paired with their display labels.
# (The "Check–in" label deliberately uses an en dash to match the
# published results table.)
review_cols <- c("Rating"        = "review_scores_rating",
                 "Accuracy"      = "review_scores_accuracy",
                 "Cleanliness"   = "review_scores_cleanliness",
                 "Check–in"      = "review_scores_checkin",
                 "Communication" = "review_scores_communication",
                 "Location"      = "review_scores_location",
                 "Value"         = "review_scores_value")
# Run cor.test() for price vs each review dimension and format the 95%
# confidence interval as "lo, hi" (replaces seven copy-pasted stanzas).
confidence_intervals <- unname(vapply(review_cols, function(col) {
  ci <- cor.test(q5$price, q5[[col]])$conf.int
  paste0(round(ci[1], 2), ", ", round(ci[2], 2))
}, character(1)))
review_category <- names(review_cols)
# Construct data frame of confidence intervals for correlation plots
correlation_df <- data.frame(review_category, confidence_intervals)
correlation_df %>% knitr::kable(align = c("c", "c"))
| review_category | confidence_intervals |
|---|---|
| Rating | -0.01, 0.03 |
| Accuracy | -0.01, 0.03 |
| Cleanliness | -0.02, 0.02 |
| Check–in | -0.01, 0.02 |
| Communication | -0.02, 0.02 |
| Location | -0.02, 0.02 |
| Value | -0.03, 0.01 |
Looking at our final table, we see that all confidence intervals contain the value 0, which we could interpret to mean that there is no significant correlation between room price and the review score. Users paying for cheap accommodations may be perfectly satisfied with their experience, while paying more does not necessarily guarantee you a higher level of satisfaction.
To see the geographical distribution of available accommodations to rent, we use the leaflet library to create an interactive map
# Join listing and host data and keep the columns the map needs.
q6 <- listing %>%
  left_join(host_info, by = "host_id") %>%
  # BUG FIX: `id` and `name` are used in the popup below (and `name` in
  # replace_na()) but were never selected, so those popup fields were empty.
  select(id, name, host_id, host_name, listing_url, latitude, longitude, price,
         review_scores_rating, number_of_reviews, neighbourhood_cleansed) %>%
  replace_na(list(name = "No Name", host_name = "No Host Name"))
# HTML popup shown when a marker is clicked
popup <- paste0("<b>", q6$name, "</b><br>",
                "Listing ID: ", q6$id, "<br>",
                "Host Name: ", q6$host_name, "<br>",
                "Price: ", q6$price, "<br>",
                "Review Scores Rating: ", ifelse(is.na(q6$review_scores_rating),
                                                 "No Review Yet", q6$review_scores_rating) , "<br>",
                "Number of Reviews: ", q6$number_of_reviews, "<br>",
                "<a href=", q6$listing_url, "> Click for more info</a>"
)
# Interactive map with clustered markers, one per listing
leaflet(data = q6) %>%
  addTiles() %>%
  addMarkers(lng = ~longitude,
             lat = ~latitude,
             popup = popup,
             clusterOptions = markerClusterOptions())
We begin by joining together the two tables on the column host_id. We then select the necessary columns and create a new column called total_earnings which consists of the formula:
We then remove the rows containing NA values and perform a count after grouping by the attributes host_id and host_name. At the same time, we calculate the average price. Finally, we select the columns we want and arrange in descending order by total_earnings.
# One row per listing: an earnings proxy plus per-host listing count and
# average price, sorted with the largest earnings first.
q7 <- listing %>%
  left_join(host_info, by = "host_id") %>%
  select(host_id, host_name, price,
         review_scores_rating, minimum_nights, number_of_reviews) %>%
  # NOTE(review): earnings proxy multiplies price by the review *score* —
  # confirm this was not meant to be number_of_reviews.
  mutate(total_earnings = price * review_scores_rating * minimum_nights) %>%
  drop_na() %>%
  # Per-host aggregates, keeping one row per listing
  group_by(host_id, host_name) %>%
  mutate(number_of_listing = n(),
         average_price = mean(price)) %>%
  ungroup() %>%
  arrange(desc(total_earnings)) %>%
  select(host_id, host_name, total_earnings, number_of_listing, average_price)
We create two plots instead of just one to examine the top hosts by revenue. The first plot examines the top hosts by the number of listings they have; the second plot depicts the top hosts by their total earnings.
We plot the results using a similar process for both plots, with the main difference being that the y-axis for the top_host_by_listing plot is ordered by the number_of_listing column, while the top_host_by_earning plot is ordered by the total_earnings column.
# Horizontal bar chart: hosts ranked by how many listings they have.
top_host_by_listing <-
  q7 %>%
  arrange(desc(number_of_listing)) %>%
  select(host_name, number_of_listing) %>%
  distinct() %>%
  slice_head(n = 15) %>%
  # Order host names on the y axis by their listing count
  ggplot(aes(x = number_of_listing,
             y = reorder(host_name, number_of_listing))) +
  geom_col(fill = "Skyblue3") +
  labs(title = "Top Host by # of Listings",
       x = "Number of Listing",
       y = "Host Name") +
  theme_tq() +
  theme(axis.text.x = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold"))
# Horizontal bar chart: hosts ranked by the total-earnings proxy.
top_host_by_earning <-
  q7 %>%
  select(host_name, total_earnings) %>%
  arrange(desc(total_earnings)) %>%
  # Drop a single spurious outlier identified during exploration
  # (magic number = that listing's total_earnings value).
  filter(total_earnings != 16242500) %>%
  head(15) %>%
  ggplot(aes(x = total_earnings, y = host_name %>% reorder(total_earnings))) +
  geom_col(fill = "Aquamarine4") +
  scale_x_continuous(labels = scales::number_format(big.mark = ",")) +
  labs(
    title = "Top Host by Total Earning",
    # Typo fix: "Eearning" -> "Earning"
    x = "Total Earning (in Baht)",
    y = "Host Name"
  ) +
  theme_tq() +
  theme(axis.text.x = element_text(angle = 45, face = "bold",
                                   vjust = 0.85, hjust = 0.89),
        axis.text.y = element_text(face = "bold"))
We use the grid.arrange() function from the gridExtra library to view the two plots side-by-side to aid in direct comparison.
grid.arrange(top_host_by_listing, top_host_by_earning, ncol = 2)
We notice that the host Bee is the only one who appears in both top lists.
# Review ratings paired with superhost status, complete cases only.
q8 <- listing %>%
  left_join(host_info, by = "host_id") %>%
  select(host_id, host_name, review_scores_rating, host_is_superhost) %>%
  drop_na() %>%
  # Keep just the two columns needed downstream, coercing the
  # superhost flag to logical.
  transmute(review_scores_rating,
            host_is_superhost = as.logical(host_is_superhost))
# Boxplot of rating scores for regular (non-super) hosts
q8_1 <-
  q8 %>%
  filter(!host_is_superhost) %>%
  ggplot(aes(y = review_scores_rating, group = host_is_superhost)) +
  geom_boxplot(fill = "Skyblue3") +
  labs(title = "Host Ratings",
       subtitle = "Ratings Distribution",
       x = "Host",
       y = "Rating") +
  theme_tq() +
  theme(axis.text.x = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold"))
# Boxplot of rating scores for superhosts
q8_2 <-
  q8 %>%
  filter(host_is_superhost) %>%
  ggplot(aes(y = review_scores_rating, group = host_is_superhost)) +
  geom_boxplot(fill = "Aquamarine3") +
  labs(title = "Superhost Ratings",
       subtitle = "Ratings Distribution",
       x = "Superhost",
       y = "Rating") +
  theme_tq() +
  theme(axis.text.x = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold"))
# Side-by-side comparison of the two distributions
grid.arrange(q8_1, q8_2, ncol = 2)
# Split ratings by superhost status and test for a difference in means
host <- q8 %>% filter(!host_is_superhost)
super_host <- q8 %>% filter(host_is_superhost)
t.test(host$review_scores_rating, super_host$review_scores_rating)
Welch Two Sample t-test
data: host$review_scores_rating and super_host$review_scores_rating
t = -25.494, df = 9990.9, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.370689 -0.317755
sample estimates:
mean of x mean of y
4.450956 4.795178
# Response/acceptance rates with superhost status; the rate columns are
# stored as percent strings (e.g. "97%") and converted to numeric here.
q9 <- listing %>%
  left_join(host_info, by = "host_id") %>%
  select(host_id, host_name, host_response_rate, host_acceptance_rate, host_is_superhost) %>%
  drop_na() %>%
  mutate(host_is_superhost = as.logical(host_is_superhost),
         # Strip the "%" sign, then coerce to numbers
         host_response_rate = as.numeric(str_remove(host_response_rate, "%")),
         host_acceptance_rate = as.numeric(str_remove(host_acceptance_rate, "%")))
# Boxplot of response rates for regular (non-super) hosts
q9_1 <-
  q9[q9$host_is_superhost == FALSE, ] %>%
  ggplot(aes(y = host_response_rate, group = host_is_superhost)) +
  geom_boxplot(fill = "Skyblue3") +
  labs(
    title = "Host Response Rate",
    # Label fix: these panels show response rates, not ratings — the old
    # subtitle and y-label were copied from the ratings plots.
    subtitle = "Response Rate Distribution",
    x = "Host",
    y = "Response Rate (%)"
  ) + theme_tq() +
  theme(axis.text.x = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold"))
# Boxplot of response rates for superhosts
q9_2 <-
  q9[q9$host_is_superhost == TRUE, ] %>%
  ggplot(aes(y = host_response_rate, group = host_is_superhost)) +
  geom_boxplot(fill = "Aquamarine3") +
  labs(
    title = "Superhost Response Rate",
    subtitle = "Response Rate Distribution",
    x = "Superhost",
    y = "Response Rate (%)"
  ) + theme_tq() +
  theme(axis.text.x = element_text(face = "bold"),
        axis.text.y = element_text(face = "bold"))
# Side-by-side comparison of the two distributions
grid.arrange(q9_1, q9_2, ncol = 2)
# Split response rates by superhost status and test for a mean difference
host_2 <- q9 %>% filter(!host_is_superhost)
super_host_2 <- q9 %>% filter(host_is_superhost)
t.test(host_2$host_response_rate, super_host_2$host_response_rate)
Welch Two Sample t-test
data: host_2$host_response_rate and super_host_2$host_response_rate
t = -35.4, df = 6805.1, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-17.87261 -15.99706
sample estimates:
mean of x mean of y
80.24118 97.17602
We begin by extracting and isolating the necessary data using the following steps:
We take the host_info table, keep the host_id and host_since columns, and split the host_since column into three separate columns for year, month and day. We then drop the day column, group by Year and Month, and use the count() function to tabulate the results. Finally, we rebuild a plottable date for each year/month, convert it with as.Date(), and drop any NA values.
q11 <-
host_info %>%
  left_join(listing, by = "host_id") %>%
  select(host_id, host_since) %>%
  # BUG FIX: the join above duplicates each host once per listing, which
  # inflated the monthly sign-up counts; keep one row per host.
  # (Also removed an unused host_since_date column computed here before.)
  distinct() %>%
  # Split the "YYYY-MM-DD" sign-up date into its components
  separate("host_since", c("Year", "Month", "Day"), sep = "-") %>%
  select(-Day) %>%
  # Tabulate how many hosts joined in each year/month
  group_by(Year, Month) %>%
  count(Year, Month) %>%
  ungroup() %>%
  # Rebuild a plottable date (first of the month) and a "YYYY-MM" label
  mutate(year_month = paste0(Year, "-", Month, "-", "01"),
         year_month_2 = paste0(Year, "-", Month),
         joined = n) %>%
  select(year_month, year_month_2, joined) %>%
  mutate(year_month = as.Date(year_month)) %>%
  drop_na()
Next, we use the ggplot library’s geom_line() function to plot the data as a time series.
# Time series of monthly host sign-ups
q11 %>%
  ggplot(aes(x = year_month, y = joined)) +
  geom_line(size = 1.2, colour = "Aquamarine4") +
  scale_x_date(breaks = waiver(), date_breaks = "6 months") + theme_tq() +
  theme(axis.text.x = element_text(angle = 45, face = "bold", vjust = 0.65),
        axis.text.y = element_text(face = "bold")) +
  labs(
    title = "Number of hosts joined",
    # Typo fix ("posts" -> "hosts"); x/y labels un-swapped: the x axis
    # carries the year/month dates, the y axis carries the join count.
    subtitle = "Shows the frequency rate at which new hosts sign up for airbnb",
    caption = "",
    x = "Year/Month",
    y = "Joined")
Since we are dealing with data over a number of years, it is helpful to also compile a list of the top 10 most active months in terms of new hosts
# Ten busiest months for new-host sign-ups
q11 %>%
  # Re-express the label as a yearmon value and keep only the two
  # columns we display
  transmute(year_month = as.yearmon(year_month_2), joined) %>%
  arrange(desc(joined)) %>%
  head(10) %>%
  knitr::kable(align = c("c", "c"))
| year_month | joined |
|---|---|
| Jul 2018 | 405 |
| Jun 2019 | 366 |
| Jul 2015 | 358 |
| Jul 2019 | 311 |
| Aug 2015 | 300 |
| Sep 2018 | 295 |
| Apr 2016 | 284 |
| Dec 2015 | 267 |
| May 2017 | 261 |
| Dec 2018 | 259 |
We see that the summer months (June–August), especially in recent years, make up almost all of the busiest months in terms of new hosts joining the service. Interestingly, December also appears twice in the top 10.